home *** CD-ROM | disk | FTP | other *** search
- Subject: v06i050: Binary search for strings in a file (bsearchstr)
- Newsgroups: mod.sources
- Approved: rs@mirror.UUCP
-
- Submitted by: hplabs!hpfcla!ajs
- Mod.sources: Volume 6, Issue 50
- Archive-name: bsearchstr
-
- [ I compiled and tested this on a 4.2BSD Vax750 and had no problems.
- The README is right -- just do a "vanilla cc"; add -DDEBUG to
- get a test stub with main. --r$ ]
-
- #!/bin/sh
- # This is a shell archive. Remove anything before this line,
- # then unpack it by saving it in a file and typing "sh file".
- #
- # Wrapped by mirror!rs on Fri Jul 11 15:00:17 EDT 1986
- # Contents: README bsearchstr.3 bsearchstr.c
-
- echo x - README
- sed 's/^XX//' > "README" <<'@//E*O*F README//'
- XXInformation on bsearchstr(3):
-
- XX1. This library routine is like bsearch(3), but generalized to work
- XX with files (not memory images) of random length strings (not fixed
- XX length objects).
-
- XX2. Unlike bsearch(), it is well-behaved. If more than one data item
- XX (text line) matches the pattern, it returns the first matching item,
- XX not a random one.
-
- XX3. It's been around a while, well tested, and shown itself to be
- XX generally useful, mainly for lookups into large, sorted lists.
-
- XX4. It runs on HP-UX (AT&T V.2); hasn't been tested on other variants.
-
- XX5. No makefile provided or needed -- compilation and linking are
- XX vanilla.
-
- XX6. No warranty express or implied accompanies this software. Caveat
- XX emptor. See the manual entry.
-
- XXAlan Silverstein, Hewlett-Packard Systems Software Operation, Fort Collins,
- XXColorado; {ihnp4 | hplabs}!hpfcla!ajs; 303-229-3053; (lat-long on request :-)
- @//E*O*F README//
- chmod u=rw,g=rw,o=rw README
-
- echo x - bsearchstr.3
- sed 's/^XX//' > "bsearchstr.3" <<'@//E*O*F bsearchstr.3//'
- XX.TH BSEARCHSTR 3 "HP Experimental"
- XX.SH NAME
- XXbsearchstr \- binary search a sorted file of random-length strings
- XX.SH SYNOPSIS
- XX.B "long bsearchstr (stream, pattern, caseins, tellnext)"
- XX.br
- XX.B "FILE\0\(**stream;"
- XX.br
- XX.B "char\0\(**pattern;"
- XX.br
- XX.B "int\0\0caseins;"
- XX.br
- XX.B "int\0\0tellnext;"
- XX.ad b
- XX.SH HP-UX COMPATIBILITY
- XX.TP 10
- XXLevel:
- XXHP-UX/RUN ONLY
- XX.TP
- XXOrigin:
- XXHewlett-Packard
- XX.SH DESCRIPTION
- XX.B Bsearchstr
- XXis a binary search routine which understands ordinary files containing sorted,
- XXrandom-length strings (lines) terminated by newline characters.
- XXIt returns the byte offset of the first line in the file which begins with the
- XXgiven pattern, taken as a simple string of characters (regular expressions are
- XXnot supported).
- XXIt also sets the file location to the line beginning at the offset returned,
- XXusing \fIfseek\fR\^(3s).
- XX.PP
- XX\fIStream\fR must be a currently open file containing lines previously sorted
- XXin increasing order by the simple numeric values of the characters.
- XX.PP
- XX\fIPattern\fR must be a valid pointer to char.
- XXA nil pattern (e.g. \(**pattern == 0) matches anything, so
- XX.B bsearchstr
- XXreturns zero (start of file) in this case.
- XXA nil line in the file (e.g. nothing but a newline) is treated as being less
- XXthan any non-nil pattern.
- XX.PP
- XXIf
- XX.I caseins
- XXis zero, searching is done case-sensitively, and the data file should be sorted
- XXaccordingly.
- XXIf
- XX.I caseins
- XXis non-zero, searching is done case-insensitively, and the data must have been
- XXsorted using "sort \-f" or equivalent.
- XXThe routine uses
- XX.IR malloc (3)
- XXto get memory to make an uppercased copy of
- XX.IR pattern ,
- XXso as not to modify the passed-in value.
- XX.PP
- XXIf \fItellnext\fR is zero, and no line in the file begins with \fIpattern\fR,
- XX.B bsearchstr
- XXreturns \-1.
- XXIf \fItellnext\fR is non-zero, it returns the offset of the first line
- XXlexically after \fIpattern\fR.
- XXIt returns the size of the file if the last line is before \fIpattern\fR.
- XX.PP
- XXEach time
- XX.B bsearchstr
- XXhalves the remainder of the file, it seeks to a
- XXlocation which is usually not the start of a line.
- XXIt searches forwards, then backwards, a character at a time, to locate the
- XXnext start of line.
- XXThis is normally more efficient than linear searching the file, especially if
- XXthe file is large and no lines are extremely long.
- XXAlso,
- XX.B bsearchstr
- XXworks fine even if the last line in the file is not
- XXterminated by a newline.
- XX.SH SEE ALSO
- XXsort(1), bsearch(3c), fgets(3s), fopen(3s), hsearch(3c), lsearch(3c),
- XXqsort(3c), tsearch(3c).
- XX.SH DIAGNOSTICS
- XXReturns \-1 if no line matches and \fItellnext\fR is zero.
- XX.PP
- XXReturns \-2 if any
- XX.IR fseek (3s),
- XX.IR ftell (3s),
- XXor
- XX.IR malloc (3c)
- XXerror occurs.
- XXIn this case the current file location is unpredictable.
- @//E*O*F bsearchstr.3//
- chmod u=rw,g=rw,o=rw bsearchstr.3
-
- echo x - bsearchstr.c
- sed 's/^XX//' > "bsearchstr.c" <<'@//E*O*F bsearchstr.c//'
- XXstatic char Uni_id[] = "@(#)28.1";
- XX/* UNISRC_ID: @(#)bsearchstr.c 28.1 86/01/04 */
- XX/*
- XX * Binary search routine for sorted file of strings (lines).
- XX * Compile with -DDEBUG for standalone testing and extra output.
- XX */
-
- XX#include <stdio.h>
- XX#include <ctype.h>
-
- XX#define chNull ('\0')
- XX#define cpNull ((char *) NULL)
-
-
- XX#ifdef DEBUG
- XX/*
- XX * Test usage: cmd file pattern [caseins [tellnext]]
- XX *
- XX * If there is a third argument, sends caseins == 1.
- XX * If there is a fourth argument, sends tellnext == 1.
- XX */
-
- XXmain (argc, argv)
- XX int argc;
- XX char **argv;
- XX{
- XX FILE *fp;
- XX char buf[BUFSIZ];
-
- XX fp = fopen (argv[1], "r");
- XX printf ("==min=== ==max=== ==pos1== ==pos2== cmp match\n");
- XX printf ("Search returns %ld\n",
- XX bsearchstr (fp, argv[2], (argc > 3), (argc > 4)));
- XX printf ("%s\n", fgets (buf, BUFSIZ, fp));
- XX}
- XX#endif /* DEBUG */
-
-
- XX/***************************************************************************
- XX * B S E A R C H S T R
- XX *
- XX * Binary search a stream consisting of sorted lines of data for the first
- XX * line that starts with the given pattern, or the next line if none and
- XX * tellnext == 1. Set the file location to the first byte of that line
- XX * and return the offset.
- XX *
- XX * When caseins is set, takes pains not to modify the memory pointed to by
- XX * pattern. Uses multi-statement macros for efficiency.
- XX *
- XX * See bsearchstr(3) for more information.
- XX *
- XX * Author: Alan Silverstein, Hewlett-Packard Fort Collins Systems Division
- XX */
-
- XX#define LT 0
- XX#define EQ 1
- XX#define GT 2
-
- XX#define RETURN(value) { if (caseins) free (pattern); return (value); }
- XX#define SETPOS { if (fseek (filep, pos, 0) < 0) RETURN (-2L); }
-
- XXlong bsearchstr (filep, pattern, caseins, tellnext)
- XX FILE *filep; /* file to search */
- XX char *pattern; /* start of line to match */
- XX int caseins; /* case insensitive? */
- XX int tellnext; /* tell next if not found? */
- XX{
- XX /* start of first line not yet compared (lower limit) */
- XX long min = 0;
- XX /* start of line after last line not yet compared (upper limit) */
- XX long max;
-
- XX long pos; /* current offset in file */
- XX char *patp; /* pattern pointer */
- XX /* warning: this must be an int, not a char */
- XXregister int ch; /* current char to compare */
- XXregister int cmp; /* results of comparison */
- XX int match = 0; /* true if match found */
-
- XX char *malloc();
-
- XX/*
- XX * PREPARE PATTERN FOR CASE-INSENSITIVE SEARCH:
- XX *
- XX * Use toupper(), not tolower(), to be compatible with the actual function
- XX * of sort -f.
- XX */
-
- XX if (caseins)
- XX {
- XX int len = strlen (pattern) + 1; /* chars to uppercase */
- XX char *cp; /* for copying data */
- XX char *cpend; /* end of copy range */
-
- XX if ((cp = patp = malloc (len)) == cpNull)
- XX return (-2L);
-
- XX /* now remember to free (pattern) before returning */
- XX /* can use the RETURN() macro to accomplish this */
-
- XX cpend = cp + len;
-
- XX while (cp < cpend) /* copy chNull also */
- XX *cp++ = toupper (*pattern++); /* toupper() is not a macro! */
-
- XX pattern = patp; /* "move" to new location */
- XX }
-
- XX/*
- XX * SET MAX TO END OF FILE (byte past last) by seeking there:
- XX */
-
- XX if ((fseek (filep, 0L, 2) < 0) || ((max = ftell (filep)) < 0))
- XX RETURN (-2L);
-
- XX/*
- XX * SET NEXT STARTING POSITION (where to find string and compare):
- XX */
-
- XX while (min < max) /* do until closure happens */
- XX {
- XX pos = (min + max) / 2;
- XX SETPOS;
-
- XX#ifdef DEBUG
- XX printf ("%8ld %8ld %8ld ", min, max, pos);
- XX#endif
-
- XX/*
- XX * LOOK FOR THE NEXT END OF LINE IN THE FORWARD DIRECTION:
- XX */
-
- XX while ((pos < max) && (getc (filep) != '\n'))
- XX pos++;
-
- XX pos++; /* skip newline */
-
- XX/*
- XX * If we ran into max, we are nearly done, assuming the current last line is
- XX * not really huge compared to all the others (but this will work out OK too).
- XX * Simply reset pos = min and check the first current line.
- XX */
-
- XX if (pos >= max)
- XX pos = min;
-
- XX/*
- XX * COMPARE THE CURRENT LINE TO THE PATTERN:
- XX *
- XX * If pattern[0] == chNull, never does the for loop, so it matches anything.
- XX */
-
- XX SETPOS;
-
- XX for (cmp = EQ, patp = pattern; (*patp != chNull); patp++)
- XX {
- XX ch = caseins ? toupper (getc (filep)) : getc (filep);
-
- XX if ((ch < *patp) || (ch == '\n') || (ch == EOF))
- XX { /* line < pattern */
- XX cmp = LT;
- XX break;
- XX }
- XX else if (ch > *patp) /* line > pattern */
- XX {
- XX cmp = GT;
- XX break;
- XX }
- XX }
-
- XX match += (cmp == EQ); /* note a match */
-
- XX#ifdef DEBUG
- XX printf ("%8ld %c %5d\n", pos,
- XX ((cmp == LT) ? '<' : ((cmp == EQ) ? '=' : '>')), match);
- XX#endif
- XX
- XX/*
- XX * MOVE MIN OR MAX according to the results of comparison:
- XX *
- XX * If pattern[0] == chNull, cmp == EQ, which is "safe".
- XX */
-
- XX if (cmp == LT) /* min => next line */
- XX {
- XX min = pos + (patp - pattern);
-
- XX while ((ch != '\n') && (min < max)) /* find end */
- XX {
- XX ch = getc (filep);
- XX min++;
- XX }
- XX min++; /* skip newline */
- XX }
- XX else /* max => this line */
- XX {
- XX max = pos;
- XX }
-
- XX } /* while */
-
- XX/*
- XX * FINISH UP:
- XX *
- XX * Note that min could be one too large if the last line is unterminated
- XX * and less than pattern, so we use max instead.
- XX */
-
- XX if (match || tellnext) /* success */
- XX {
- XX pos = max;
- XX SETPOS;
- XX RETURN (pos);
- XX }
- XX else /* failure */
- XX {
- XX RETURN (-1L);
- XX }
-
- XX} /* bsearchstr */
- @//E*O*F bsearchstr.c//
- chmod u=rw,g=rw,o=rw bsearchstr.c
-
- echo Inspecting for damage in transit...
- temp=/tmp/sharin$$; dtemp=/tmp/sharout$$
- trap "rm -f $temp $dtemp; exit" 0 1 2 3 15
- cat > $temp <<\!!!
- 23 133 905 README
- 87 435 2678 bsearchstr.3
- 220 923 5190 bsearchstr.c
- 330 1491 8773 total
- !!!
- wc README bsearchstr.3 bsearchstr.c | sed 's=[^ ]*/==' | diff -b $temp - >$dtemp
- if test -s $dtemp
- then echo "Ouch [diff of wc output]:" ; cat $dtemp
- else echo "No problems found."
- fi
- exit 0
-